/* 
    Purpose: Create parental income scores 
             (retrospective, both parents) for
             adult child respondents. Put blended 
             measures in 2015 dollars. Save analysis sample
             (i.e. respondents that have predicted parental
             income, actual income, and parental actual income).
    
    Note: All measures of income must be logged (otherwise 
          log-level models will be created in 3a).

    Creates: PSID_incscores_bothparents.dta
*/

* Reset the session, move to the PSID working directory, and load the
* cleaned both-parents extract that the rest of this file builds on.
clear
set more off

cd "$Mydirectory1/1_DataSources/PSID"

    use ./output/PSID_bothparents_clean, clear

***************************************************************************************
*  Part 1: FATHER INCOME (RETROSPECTIVE)
***************************************************************************************

/* Preliminary step: every respondent flagged father_notworking==1 gets
                     father occupation code 99, so that these rows carry a
                     non-missing occupation key when the Census income
                     scores are merged below. */

    // Guard: the recode must only fill in occupations that are currently missing.
    assert fatheroccej==. if father_notworking==1
    replace fatheroccej = 99 if father_notworking==1

    // Eyeball checks of the recode (missing values included in the tables).
    tabulate father_notworking fatheroccej if father_notworking==1, missing
    tabulate fatheroccej, missing

*---------------------------------
* CENSUS INCOME SCORES 
*---------------------------------
/* 1. Occupation x race x south.
      The Census score file is staged as a sorted tempfile and then attached
      to each respondent by father's occupation, race, and south_merge. */
    tempfile father_census_scores

    preserve
        use "../CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta", clear
        sort fatheroccej race south_merge
        save `father_census_scores'
    restore

    sort fatheroccej race south_merge
    merge m:1 fatheroccej race south_merge using `father_census_scores'

    // A respondent may go unmatched only when at least one merge key is missing.
    assert fatheroccej==. | race==. | south_merge==. if _merge==1

    // Score cells that match no respondent are not needed.
    drop if _merge==2

    // Discard the merge indicator and the score variants that are dropped at this stage.
    drop _merge *altwgt flag* *CWfix avgincwage* *_201* ///
         avg_inctot_1960_byocc_byr_bys ///
         avg_inctot_1970_byocc_byr_bys ///
         avg_inctot_1980_byocc_byr_bys ///
         avg_inctot_1990_byocc_byr_bys ///
         avg_inctot_2000_byocc_byr_bys

*----------------------------------*
* SUPPLEMENTAL 1936 INCOME SCORES
*----------------------------------*
    /* Attach the 1936 consumption-survey income scores using the same
       occupation x race x south key as the Census merge. */
    tempfile scores_1936

    preserve
        use "../ConsumptionSurvey_1936/output/ConsumptionSurvey_1936_IncomeScores.dta", clear
        sort fatheroccej race south_merge
        save `scores_1936'
    restore

    merge m:1 fatheroccej race south_merge using `scores_1936'

    // Unmatched respondents must either lack a merge key or be coded
    // occupation 99 (the non-working code assigned earlier in this file).
    assert inlist(fatheroccej,.,99) | race==. | south_merge==. if _merge==1

    drop if _merge==2

    // Only avg_totfaminc_1936 is carried forward from this file.
    drop _merge number_obs_cell_1936 flag_impute_1936 ///
         avg_totfaminc_1936_altwt ///
         avg_totfaminc_1936_byocc_byr ///
         avg_totfaminc_1936_byocc ///
         avg_totfaminc_1936_byocc_bys ///
         avg_totfaminc_1936_byr_bys ///
         avg_totfaminc_1936_bysouth ///
         avg_totfaminc_1936_byrace

*-----------------------------------------------*
* SUPPLEMENTAL INCOME SCORES FOR FARMERS (1900) *
*-----------------------------------------------*
    /* Build the occupation key used by the 1900 score file. Code 21 is
       folded into code 28 because the Preston & Haines data make no
       self-employment distinction; all other codes pass through unchanged. */
    gen occ1950ej_PH = cond(fatheroccej==21, 28, fatheroccej)

    tempfile scores_1900

    preserve
        use "../1900_IncomeScores/output/IncomeScores_1900_byrace_bysouth.dta", clear
        sort occ1950ej_PH race south_merge
        save `scores_1900'
    restore

    merge m:1 occ1950ej_PH race south_merge using `scores_1900'

    // Unmatched respondents must either lack a merge key or be coded occupation 99.
    assert inlist(fatheroccej,.,99) | race==. | south_merge==. if _merge==1

    drop if _merge==2

    // Drop the temporary key and every 1900 score variant other than the
    // two occ x race x south measures used further down
    // (income_PH_farmfix_byr_bys and netearn00_adj_byr_bys).
    drop _merge occ1950ej_PH ///
         income_PH_farmfix_byocc income_PH_farmfix_byrace income_PH_farmfix_bysouth ///
         netearn00_adj_byocc netearn00_adj_byrace netearn00_adj_bysouth ///
         income_PH_just_race income_PH_just_south income_PH_just_race_south

*----------------------------------------------------------------*
* PUT ALL RELEVANT FATHER INCOME SCORES IN 2015$ BEFORE LOGGING
*----------------------------------------------------------------*

/*
   Inflate the income scores from 1950 dollars to 2015 dollars by
   multiplying each one by CPI(2015)/CPI(1950).
   Source: https://data.bls.gov/timeseries/CUUR0000SA0

   NOTE(review): this treats every score listed below as being denominated
   in 1950 dollars when it arrives here -- confirm against the upstream
   score-building files.

   The two CPI constants are held in temporary variables, so they cannot
   collide with a dataset variable of the same name, and they are dropped
   by name rather than with a CPI* wildcard that could also remove
   unrelated variables. The arithmetic is unchanged.
*/
    tempvar cpi1950 cpi2015
    gen `cpi1950' = 24.1
    gen `cpi2015' = 237.017

    global inc1950 "avg_HHinc_1940_byrace_bysouth avg_inctot_1950_byocc_byr_bys avg_HHinc_1960_byocc_byr_bys avg_HHinc_1970_byocc_byr_bys avg_HHinc_1980_byocc_byr_bys avg_HHinc_1990_byocc_byr_bys avg_HHinc_2000_byocc_byr_bys avg_totfaminc_1936 income_PH_farmfix_byr_bys netearn00_adj_byr_bys"

    foreach i of global inc1950 {
        replace `i' = `i' * (`cpi2015'/`cpi1950')
    }

    // Remove the helper variables explicitly so they can never reach the saved file.
    drop `cpi1950' `cpi2015'

*---------------------------------------------------*
* CREATE LOGGED HH INCOME AND 1936 INCOME SCORES   
*---------------------------------------------------*

    // HH INCOME (occ x race x south level).
    // For 1950 the inctot score stands in under the HHinc naming pattern,
    // so the decade loop below can address every year by the same stub.
    clonevar avg_HHinc_1950_byocc_byr_bys = avg_inctot_1950_byocc_byr_bys
    label var avg_HHinc_1950_byocc_byr_bys "clone of avg_inctot_1950_byocc_byr_bys"

    foreach yr of numlist 1950(10)2000 {
        gen log_father_`yr'_byr_bys = ln(avg_HHinc_`yr'_byocc_byr_bys)
        label var log_father_`yr'_byr_bys "Logged father's household income, `yr', by race by south"
    }

    // Mix 1940 Census data with the 1936 survey (occ x race x south level):
    // occupation codes 21 and 81 take the 1936 survey score, everyone else
    // takes the 1940 Census score.
    gen father_HHinc_1936fix = cond(inlist(fatheroccej,21,81), avg_totfaminc_1936, avg_HHinc_1940_byrace_bysouth)
    label var father_HHinc_1936fix "Father baseline income score, 1936 farm and self-emp"

    gen log_father_HHinc_1936fix = ln(father_HHinc_1936fix)
    label var log_father_HHinc_1936fix "Logged father's baseline HH income, 1936 farm and self-emp. fix"

    // 1900 income score: mix the two 1900 data sources. Occupation code 81
    // takes netearn00_adj; all other codes take income_PH_farmfix.
    gen father_inc_1900_byr_bys = cond(fatheroccej==81, netearn00_adj_byr_bys, income_PH_farmfix_byr_bys)
    label var father_inc_1900_byr_bys "Father income score, 1900 Census of Ag and Preston Haines"

    gen log_father_1900_byr_bys = ln(father_inc_1900_byr_bys)
    label var log_father_1900_byr_bys "Logged father's income, 1900 Census of Ag and Preston Haines"

    drop income_PH_farmfix* netearn00*

    // Consistency check: nobody has a 1900 score without also having the
    // 1936-fix baseline score.
    assert log_father_1900_byr_bys==. if log_father_HHinc_1936fix==.

*--------------------------------------------------*
/* CREATE BLENDED MEASURE OF INCOME FOR 
  (1) WORKING FATHERS
  (2) ALL FATHERS (WORKING + NON-WORKING) */
*--------------------------------------------------*

// Calendar year in which the respondent turned 10; the interpolation
// weights below are keyed on this year. Also reused by the mother section.
gen age10 = birthyear+10

// The loop runs the same interpolation twice, once per output variable.
// The two passes differ only in the post-processing branches at the end:
//   log_father_interpolated : set to missing where father_notworking==1
//   father_plusnotworking   : left as computed, so non-working fathers
//                             (occupation code 99) keep a value
foreach z in log_father_interpolated father_plusnotworking {

    // The 1936-fix baseline score plays the role of the "1940" endpoint.
    // It is cloned under the decade naming pattern and dropped again at
    // the end of each pass.
    local measure "log_father_HHinc_1936fix"
    clonevar log_father_1940_byr_bys = `measure' 

    gen `z' =.
    
    /* NOTE: Birth cohorts in PSID extract are 
             between 1933 and 1985 */

    /*1933-1950 birth cohorts: Give them a weighted 
                               average of 1940 and 
                               1960 income scores. */
    // Linear weights over a 20-year span: k/20 on the 1940 endpoint and
    // j/20 on the 1960 endpoint. The 1950 score is not used here.
    // Missing values propagate through the arithmetic, so the blend is
    // missing whenever either endpoint is missing, even at weight zero.
    forval i=1941(1)1960 {
    
        local j=`i'-1940 //# years away from first decade
        local k=20-`j' //# years away from second decade
        
        replace `z' = ((`k'/20)*log_father_1940_byr_bys) + ((`j'/20)*log_father_1960_byr_bys) if age10==`i'
    }   

    /* 1951-1980 birth cohorts: Give them a weighted average
                                of two Censuses closest to when 
                                the survey respondent turned 10. */
    // decade1 is the tens digit of the starting Census year (6 = 1960, and
    // so on); decade2 is the following decade. Each inner loop covers the
    // ten age10 years after the starting Census, up to and including the
    // next Census year.
    foreach decade1 in 6 7 8 {
        local decade2 = `decade1'+1
    
        forval i=19`decade1'1(1)19`decade2'0 {
        
            local j=`i'-19`decade1'0 //# years away from first decade
            local k=10-`j' //# years away from second decade
            
            replace `z' = ((`k'/10)*log_father_19`decade1'0_byr_bys) + ((`j'/10)*log_father_19`decade2'0_byr_bys) if age10==`i'
        }
    }

    /* 1981-1985 birth cohorts: Give them a weighted average
                                of two Censuses closest to when 
                                the survey respondent turned 10. */
    // Handled outside the decade loop above because the end year, 2000,
    // does not fit the two-digit-prefix naming used there.
    forval i=1991(1)2000  {
        local j=`i'-1990 //# years away from first decade
        local k= 10-`j' //# years away from second decade

        replace `z' = ((`k'/10)*log_father_1990_byr_bys) + ((`j'/10)*log_father_2000_byr_bys) if age10==`i'
    }

    // First pass only: validate coverage, then blank out non-working fathers.
    if "`z'"=="log_father_interpolated" {
        // Every father with a baseline score and a real occupation must
        // have received a blended value.
        assert `z'!=. if `measure'!=. & fatheroccej!=99
        label var `z' "Logged father's income (retrospective), interpolated for each decade (in 2015$)"
        assert `z'!=. if race!=. & fatheroccej<99 & south_merge!=.
  
        /* SUPER IMPORTANT: EXCLUDE NON-WORKING DADS FROM ALL 
                            BLENDED MEASURES MADE UP TO THIS 
                            POINT. */
        // The two summaries bracket the replace: the first shows what is
        // about to be removed, the second shows what is left for that group.
        sum `z' if father_notworking==1, d
        replace `z' =. if father_notworking==1
        sum `z' if father_notworking==1

        // Re-check that blanking non-working fathers did not remove any
        // value belonging to a father with a real occupation.
        assert `z'!=. if fatheroccej<99 & race!=. & south_merge!=.        
    }

    // Second pass only: validate the all-fathers version.
    if "`z'"=="father_plusnotworking" {
        //Note: Respondents born in 1980s won't have a 2000 census income score.
        assert `z'!=. if log_father_1940_byr_bys!=. & !inrange(birthyear,1980,1989) 
        label var `z' "Logged father's baseline income (interpolated, retro), adding non-working fathers"
        // The two measures may differ only for occupation code 99, i.e.
        // the rows blanked in the first pass.
        assert fatheroccej==99 if `z'!=log_father_interpolated
    }

    // Remove the per-pass clone so the clonevar at the top of the next
    // pass does not fail on an existing variable.
    drop log_father_1940_byr_bys

    }
    
***************************************************************************************  
*  Part 2: MOTHER INCOME (RETROSPECTIVE)
***************************************************************************************  

*---------------------------------*
* CENSUS INCOME SCORES 
*---------------------------------*

    label variable motheroccej "Mother's coarsened occupation"

    /* Stage the mothers' Census score file as a sorted tempfile, then attach
       it by mother's occupation, race, and south_merge. */
    tempfile mother_census_scores

    preserve
        use "../CensusData/output/IncomeScores_Coarsened_byrace_bysouth_moms.dta", clear
        sort motheroccej race south_merge
        save `mother_census_scores'
    restore

    sort motheroccej race south_merge
    merge m:1 motheroccej race south_merge using `mother_census_scores'

    // A respondent may go unmatched only when at least one merge key is missing.
    assert motheroccej==. | race==. | south_merge==. if _merge==1

    // Score cells that match no respondent are not needed.
    drop if _merge==2

    drop _merge mom_avg_inctot_1950_byocc_rs

*----------------------------------------------------------------*
* PUT ALL RELEVANT MOTHER INCOME SCORES IN 2015$ BEFORE LOGGING
*----------------------------------------------------------------*

/*
   Inflate the income scores from 1950 dollars to 2015 dollars by
   multiplying each one by CPI(2015)/CPI(1950).
   Source: https://data.bls.gov/timeseries/CUUR0000SA0

   NOTE(review): this treats every score listed below as being denominated
   in 1950 dollars when it arrives here -- confirm against the upstream
   score-building files.

   The two CPI constants are held in temporary variables, so they cannot
   collide with a dataset variable of the same name, and they are dropped
   by name rather than with a CPI* wildcard that could also remove
   unrelated variables. The arithmetic is unchanged.
*/
    tempvar cpi1950 cpi2015
    gen `cpi1950' = 24.1
    gen `cpi2015' = 237.017

    global inc1950 "mom_HHinc_byr_bys_CWfix mom_avg_HHinc_1960_byocc_rs mom_avg_HHinc_1970_byocc_rs mom_avg_HHinc_1980_byocc_rs mom_avg_HHinc_1990_byocc_rs"

    foreach i of global inc1950 {
        replace `i' = `i' * (`cpi2015'/`cpi1950')
    }

    // Remove the helper variables explicitly so they can never reach the saved file.
    drop `cpi1950' `cpi2015'

*---------------------------------------
* LOG VARIOUS MOTHER INCOME SCORES
*---------------------------------------
/*Note: At occ x race x south level*/

    // 1940 endpoint: taken from the CW-fix household income score.
    gen log_mother_1940_byr_bys = ln(mom_HHinc_byr_bys_CWfix)
    label var log_mother_1940_byr_bys "Logged mother's household income, 1940, by race by south, CW fix"

    // 1960-1990 endpoints, one per Census decade.
    foreach yr of numlist 1960(10)1990 {
        gen log_mother_`yr'_byr_bys = ln(mom_avg_HHinc_`yr'_byocc_rs)
        label var log_mother_`yr'_byr_bys "Logged mother's household income, `yr', by race by south"
    }

*---------------------------------------------------------*
* CREATE BLENDED MEASURE OF INCOME FOR WORKING MOTHERS
*---------------------------------------------------------*

    gen log_mother_interpolated =.

    /*NOTE: Birth cohorts in PSID extract are 
            between 1933-1985 */

    /* Respondents who turned 10 in 1941-1960 (1933-1950 birth cohorts in
       this extract): weighted average of the 1940 and 1960 scores, with
       weights linear in the distance from each endpoint over 20 years. */
    forvalues yr = 1941/1960 {
        local elapsed   = `yr' - 1940       // years since the 1940 endpoint
        local remaining = 20 - `elapsed'    // years until the 1960 endpoint

        replace log_mother_interpolated = ((`remaining'/20)*log_mother_1940_byr_bys) + ((`elapsed'/20)*log_mother_1960_byr_bys) if age10==`yr'
    }

    /* Respondents who turned 10 in 1961-1990 (1951-1980 birth cohorts):
       weighted average of the two Censuses that bracket the year the
       respondent turned 10. */
    foreach start in 1960 1970 1980 {
        local stop  = `start' + 10
        local first = `start' + 1

        forvalues yr = `first'/`stop' {
            local elapsed   = `yr' - `start'    // years since the earlier Census
            local remaining = 10 - `elapsed'    // years until the later Census

            replace log_mother_interpolated = ((`remaining'/10)*log_mother_`start'_byr_bys) + ((`elapsed'/10)*log_mother_`stop'_byr_bys) if age10==`yr'
        }
    }

    /*NOTE: Some PSID respondents turn 10 between 1990
            and 2000, but 2000 income scores are not 
            available for mothers. Cannot make a blended
            mother income score for these respondents. */

    // Coverage check: anyone with a 1940 score and a known age-10 year,
    // outside the 1980s birth cohorts, must have a blended value.
    assert log_mother_interpolated!=. if log_mother_1940_byr_bys!=. & age10!=. & !inrange(birthyear,1980,1989)
    label var log_mother_interpolated "Logged mother's household income (race x south), interp. 1940-1990, retro, CW fix"

    // The raw and logged decade scores are no longer needed once blended.
    drop mom_* ///
         log_mother_1940_byr_bys ///
         log_mother_1960_byr_bys ///
         log_mother_1970_byr_bys ///
         log_mother_1980_byr_bys ///
         log_mother_1990_byr_bys

************************************************************************************
*  Part 3: OTHER PARENTAL INCOME MEASURES AND SAMPLES
************************************************************************************

*---------------------------------------
/* Create logged, interpolated measure 
   for all parents (2 types of dads 
   and mom) */
*---------------------------------------

        // Priority order: working father first, then mother, then the
        // father measure that includes non-working fathers. A later source
        // only fills rows that are still missing.
        gen parent_income_all = log_father_interpolated
        foreach fallback in log_mother_interpolated father_plusnotworking {
            replace parent_income_all = `fallback' if parent_income_all==. & `fallback'!=.
        }
        label var parent_income_all "Baseline (logged) income (retro, interp) for fathers, unemp fathers, and mothers"

        // Clear out the remaining intermediate score variables.
        drop *_byr_bys avg* *1936fix

*---------------------------------------
/* Create actual income measure
   that combines mom and dad actual 
   income. */
*---------------------------------------

    // For each averaging window (3 or 5 years): start from the father's
    // logged actual income and fall back to the mother's where the
    // father's is missing.
    foreach yrs in 3 5 {
        local dad_actual "log_mean_fathertotfaminc_`yrs'years"
        local mom_actual "log_mean_mothertotfaminc_`yrs'years"

        gen log_parent_actualinc_`yrs'years = `dad_actual'
        replace log_parent_actualinc_`yrs'years = `mom_actual' if log_parent_actualinc_`yrs'years==. & `mom_actual'!=.

        label var log_parent_actualinc_`yrs'years "(Logged) Parental avg actual income, using `yrs' years and totfaminc"
    }


*---------------------------------------
/* WINSORIZED versions of 
   actual income. Note: winsorize
   and then log */
*---------------------------------------

    // Step 1: unlogged parental actual income -- the father's value, with
    //         the mother's filled in where the father's is missing.
    foreach yrs in 3 5 {
        gen parent_actualinc_`yrs'years = mean_fathertotfaminc_`yrs'years
        replace parent_actualinc_`yrs'years = mean_mothertotfaminc_`yrs'years if parent_actualinc_`yrs'years==. & mean_mothertotfaminc_`yrs'years!=.
        label var parent_actualinc_`yrs'years "(NOT Logged) Parental avg actual income, using `yrs' years and totfaminc"
    }

    // Step 2: reassign 2.5% of values in either tail.
    //         Source variables and their winsorized names are paired by
    //         position in the two lists below.
    //         NOTE(review): winsor is not a built-in command; presumably the
    //         user-written SSC package -- confirm it is installed.
    global winsor_incvars "mean_fathertotfaminc_3years mean_fathertotfaminc_5years parent_actualinc_3years parent_actualinc_5years"
    local winsor_stubs "dadactinc_3 dadactinc_5 parentactinc_3 parentactinc_5"

    local n_winsor : word count $winsor_incvars
    forvalues idx = 1/`n_winsor' {
        local src  : word `idx' of $winsor_incvars
        local stub : word `idx' of `winsor_stubs'

        winsor `src', p(0.025) gen(winsor_`stub')
    }

    // Step 3: log the winsorized variables.
    foreach stub of local winsor_stubs {
        gen log_winsor_`stub' = ln(winsor_`stub')
    }


*---------------
* Samples
*---------------
 /*NOTE: We only want to use the working 
        dad + working moms measures for 2
        types of respondents:
        (1) Rs with predicted dad income AND
            actual dad income
        (2) Rs with predicted mom income AND 
            actual mom income*/

    foreach yrs in 3 5 {
        local dad_actual "log_mean_fathertotfaminc_`yrs'years"
        local mom_actual "log_mean_mothertotfaminc_`yrs'years"

        // (1) Father: actual income and predicted income both present.
        //     NOTE(review): the predicted side is father_plusnotworking
        //     (which includes non-working fathers), not
        //     log_father_interpolated -- confirm this is intended.
        gen dad_bothincomes_avail_`yrs'years = (`dad_actual'!=. & father_plusnotworking!=.)

        // (2) Mother: actual income and predicted income both present.
        gen mom_bothincomes_avail_`yrs'years = (`mom_actual'!=. & log_mother_interpolated!=.)

        // Respondent qualifies through either parent.
        gen star_Rs_`yrs'yrs = (dad_bothincomes_avail_`yrs'years==1) | (mom_bothincomes_avail_`yrs'years==1)
        tabulate star_Rs_`yrs'yrs, missing

        label var star_Rs_`yrs'yrs "R w/ both kinds of father inc OR both kinds of mother inc--`yrs' yrs actual inc"
    }

    // Samples w/o both kinds of parental income: four flags per window,
    // each marking respondents who have one kind of income but not the other.
    foreach yrs in 3 5 {
        local dad_actual "log_mean_fathertotfaminc_`yrs'years"
        local mom_actual "log_mean_mothertotfaminc_`yrs'years"

        // (1) Rs with actual father inc but not predicted
        gen dadactual_nopred_`yrs'yrs = (`dad_actual'!=. & log_father_interpolated==.)
        label var dadactual_nopred_`yrs'yrs "Fathers--actual inc available (`yrs' years), no pred inc"

        // (2) Rs with predicted father inc but not actual
        gen dadpred_noactual_`yrs'yrs = (`dad_actual'==. & log_father_interpolated!=.)
        label var dadpred_noactual_`yrs'yrs "Fathers--no actual inc (`yrs' years), pred inc available"

        // (3) Rs with actual mother inc but not predicted
        gen momactual_nopred_`yrs'yrs = (`mom_actual'!=. & log_mother_interpolated==.)
        label var momactual_nopred_`yrs'yrs "Mothers--actual inc available (`yrs' years), no pred inc"

        // (4) Rs with predicted mother inc but not actual
        gen mompred_noactual_`yrs'yrs = (`mom_actual'==. & log_mother_interpolated!=.)
        label var mompred_noactual_`yrs'yrs "Mothers--no actual inc (`yrs' years), pred inc available"
    }


    //Analysis samples
    // Working-dads sample: respondent's own income, father's actual income,
    // and the working-father predicted income are all present.
    foreach yrs in 3 5 {
        gen analysis_samp_`yrs'yrs_workingdads = (log_mean_totfaminc_`yrs'years!=. & log_mean_fathertotfaminc_`yrs'years!=. & log_father_interpolated!=.)
    }

    // All-parents sample: respondent's own income, combined parental actual
    // income, and combined predicted income are all present.
    // NOTE: must have last condition (star_Rs), which requires that actual
    // and predicted income are both available for the same parent.
    foreach yrs in 3 5 {
        gen analysis_samp_`yrs'yrs_allparents = (log_mean_totfaminc_`yrs'years!=. & log_parent_actualinc_`yrs'years!=. & parent_income_all!=. & star_Rs_`yrs'yrs==1)
    }

******************
* SAVE
******************

    * Binned version of birth year 
    /* Note: Need this version because 
             of small sample size per 
             birth year */
    // Bins 1-6 each span five birth years; bin 7 spans six (1976-1981).
    // Birth years outside 1946-1981 are left missing.
    // NOTE(review): confirm the six-year last bin and the missing bin for
    // cohorts born after 1981 are intended.
    gen birthyear_bin5 = 1 if inrange(birthyear,1946,1950)
        replace birthyear_bin5 = 2 if inrange(birthyear,1951,1955)
        replace birthyear_bin5 = 3 if inrange(birthyear,1956,1960)
        replace birthyear_bin5 = 4 if inrange(birthyear,1961,1965)
        replace birthyear_bin5 = 5 if inrange(birthyear,1966,1970)
        replace birthyear_bin5 = 6 if inrange(birthyear,1971,1975)
        replace birthyear_bin5 = 7 if inrange(birthyear,1976,1981) 
    tab  birthyear_bin5, m

    /* Parent identifier: the father's ID, overridden by the mother's ID
       where momlink_only==1.
       Created as double so that large integer IDs are stored exactly. A
       variable created with the default float type represents integers
       exactly only up to 16,777,216, and replace does not promote float to
       double, so larger IDs would be silently rounded. The compress below
       shrinks the variable to the smallest type that holds it losslessly. */
    gen double parent_id = father_id
    replace parent_id = mother_id if momlink_only==1 

    compress
    save ./output/PSID_incscores_bothparents, replace

